#Background #Talking about why I chose this topic #Why is this important? #What was my question(s)? #Talk about the data set(s) that I used

#Talk about the cleaning. I cleaned this dataset in a separate file and imported the cleaned data. #Doing a little more cleaning… #Change data types to factors

#Take a quick look at the data frame using skim

Data summary
Name df
Number of rows 1356
Number of columns 13
_______________________
Column type frequency:
factor 3
numeric 10
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
gender 0 1 FALSE 2 Mal: 684, Fem: 672
smoking_status 0 1 FALSE 2 No: 894, Yes: 462
sleep_type 0 1 FALSE 3 Dee: 452, Lig: 452, REM: 452

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 226.50 130.53 1.0 113.75 226.50 339.25 452.00 ▇▇▇▇▇
age 0 1.00 40.29 13.16 9.0 29.00 40.00 52.00 69.00 ▂▇▇▇▂
sleep_duration 0 1.00 7.47 0.87 5.0 7.00 7.50 8.00 10.00 ▂▆▇▂▁
sleep_efficiency 0 1.00 0.79 0.14 0.5 0.70 0.82 0.90 0.99 ▃▃▅▇▇
awakenings 60 0.96 1.64 1.36 0.0 1.00 1.00 3.00 4.00 ▅▇▃▃▃
caffeine_consumption 75 0.94 23.65 30.18 0.0 0.00 25.00 50.00 200.00 ▇▃▁▁▁
alcohol_consumption 42 0.97 1.17 1.62 0.0 0.00 0.00 2.00 5.00 ▇▁▁▁▁
exercise_frequency 18 0.99 1.79 1.43 0.0 0.00 2.00 3.00 5.00 ▇▂▅▂▁
sleep_type_percentage 0 1.00 33.33 18.83 7.0 19.00 24.00 55.00 75.00 ▇▆▁▅▂
bedtime 0 1.00 2356.48 166.90 2100.0 2200.00 2400.00 2500.00 2630.00 ▇▅▅▅▆

#I was interested to see if bedtime would affect sleep efficiency (what that is…)

# Turn a bedtime code into a 12-hour clock label, e.g. 2130 -> "9:30 p.m."
# and 2400 -> "12:00 a.m.".
# NOTE(review): assumes bedtime is coded as HHMM with after-midnight hours
# written as 24, 25, 26 (the data summary shows a 2100-2630 range) -- confirm
# against the cleaning script.
format_bedtime <- function(code) {
  code <- as.numeric(as.character(code))
  hour24 <- (code %/% 100) %% 24
  minute <- code %% 100
  hour12 <- ifelse(hour24 %% 12 == 0, 12, hour24 %% 12)
  suffix <- ifelse(hour24 < 12, "a.m.", "p.m.")
  sprintf("%d:%02d %s", hour12, minute, suffix)
}

# Plot of bedtime effect on sleep efficiency.
# Axis labels are derived from each bedtime level instead of a fixed positional
# vector, so they stay correct if a bedtime level is added or filtered out.
bed_plot1 <- df %>%
  ggplot(aes(x = factor(bedtime), y = sleep_efficiency, color = factor(bedtime))) +
  geom_point() +
  scale_x_discrete(labels = format_bedtime) +
  labs(x = "Bedtime", y = "Sleep Efficiency Proportion", title = "Scatterplot of Bedtime's Effect on Sleep Efficiency") +
  theme_minimal()

# Interactive plot
ggplotly(bed_plot1)

#This didn’t show what I thought it would. It doesn’t appear that sleep efficiency is affected much by bedtime #Let’s create some models to see what does affect sleep efficiency! # I want to see the effect of REM sleep, using the sleep_type_percentage column

# Keep only the rows whose sleep_type is "REM"; within this subset
# sleep_type_percentage is presumably the share of sleep spent in REM.
rem_df <- df %>%
  filter(sleep_type == "REM")

# Linear model of sleep efficiency on all candidate predictors.
# Fitted with lm() to match the Deep-sleep model below; the previous glm() call
# used the default gaussian family, which yields the same estimates.
mod1REM <- lm(data = rem_df, formula = sleep_efficiency ~ age + gender +
                sleep_duration + awakenings + caffeine_consumption +
                alcohol_consumption + smoking_status + exercise_frequency +
                sleep_type_percentage + bedtime)

# Coefficient table rendered as a hover-highlighted HTML table.
tidy(mod1REM) %>%
  kableExtra::kable() %>%
  kableExtra::kable_classic(lightable_options = "hover")
term estimate std.error statistic p.value
(Intercept) 0.8653406 0.0911912 9.4892980 0.0000000
age 0.0013841 0.0003856 3.5899924 0.0003744
genderMale 0.0082960 0.0108749 0.7628562 0.4460262
sleep_duration -0.0023318 0.0055513 -0.4200353 0.6746989
awakenings -0.0475231 0.0038185 -12.4454136 0.0000000
caffeine_consumption 0.0001605 0.0001775 0.9044667 0.3663256
alcohol_consumption -0.0237291 0.0031003 -7.6537166 0.0000000
smoking_statusYes -0.0781570 0.0106176 -7.3610714 0.0000000
exercise_frequency 0.0117639 0.0037580 3.1303375 0.0018822
sleep_type_percentage 0.0016225 0.0014384 1.1279679 0.2600508
bedtime -0.0000210 0.0000322 -0.6519992 0.5147990

#REM sleep percentage is not a significant predictor of sleep_efficiency (p = 0.26), so we will not use that one

#Instead I tried Deep Sleep

# Subset to the "Deep" sleep-type rows.
# Note from the analysis: time in deep sleep is significant, REM is not.
deep_df <- filter(df, sleep_type == "Deep")

# Linear model of sleep efficiency on every candidate predictor, deep-sleep rows only.
mod1 <- lm(
  sleep_efficiency ~ age + gender + sleep_duration + awakenings +
    caffeine_consumption + alcohol_consumption + smoking_status +
    exercise_frequency + sleep_type_percentage + bedtime,
  data = deep_df
)

# Coefficient table rendered as a hover-highlighted HTML table.
kableExtra::kable_classic(
  kableExtra::kable(tidy(mod1)),
  lightable_options = "hover"
)
term estimate std.error statistic p.value
(Intercept) 0.5812434 0.0582704 9.9749417 0.0000000
age 0.0011359 0.0002612 4.3491404 0.0000176
genderMale -0.0033367 0.0073479 -0.4540953 0.6500216
sleep_duration 0.0017794 0.0037713 0.4718332 0.6373188
awakenings -0.0323604 0.0026860 -12.0479598 0.0000000
caffeine_consumption 0.0003140 0.0001200 2.6161485 0.0092502
alcohol_consumption -0.0078486 0.0022348 -3.5120300 0.0004986
smoking_statusYes -0.0436934 0.0073560 -5.9398190 0.0000000
exercise_frequency 0.0069996 0.0025578 2.7365283 0.0065029
sleep_type_percentage 0.0051885 0.0002460 21.0909567 0.0000000
bedtime -0.0000284 0.0000218 -1.3066559 0.1921261

#Deep Sleep is significant (`***`) #All others that are `***`: Age, Awakenings, Alcohol Consumption, Smoking_Status(Yes) #`**`: Caffeine_consumption, exercise_frequency #Things that surprised me: Gender not making a difference, Bedtime of course, Sleep duration (I would have thought that the longer you slept, the better your sleep quality would be)

#Plot ** and *** #age

# Age vs. sleep efficiency for the deep-sleep rows, with a smoothed trend line
# (no confidence band), then shown as an interactive plotly widget.
p1 <- ggplot(deep_df, aes(x = age, y = sleep_efficiency)) +
  geom_point(colour = "steelblue") +
  geom_smooth(se = FALSE, colour = "darkgreen") +
  theme_minimal()
ggplotly(p1)
# Another look: age vs. sleep efficiency with one linear trend per number of
# awakenings. Rows with a missing awakenings value are dropped first so they do
# not show up as an extra "NA" group with its own trend line.
deep_df %>%
  filter(!is.na(awakenings)) %>%
  ggplot(aes(x = age, y = sleep_efficiency, color = factor(awakenings))) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm")

#awakenings

# Distribution of sleep efficiency for each number of awakenings (deep-sleep
# rows, missing awakenings dropped).
p2 <- deep_df %>%
  filter(!is.na(awakenings)) %>%
  ggplot(aes(x = factor(awakenings), y = sleep_efficiency, fill = factor(awakenings))) +
  geom_violin() +
  theme_minimal() +
  # The fill repeats the x axis, so the legend carries no extra information.
  # Must come after theme_minimal(), which would otherwise reset it.
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Dark2")
ggplotly(p2)

# Same comparison as a boxplot. Uses deep_df rather than df so that each row is
# counted once, consistent with the other plots in this section.
# TODO: maybe better as a color or facet.
deep_df %>%
  filter(!is.na(awakenings)) %>%
  ggplot(aes(x = factor(awakenings), y = sleep_efficiency, fill = factor(awakenings))) +
  geom_boxplot()

#alcohol

# Distribution of sleep efficiency for each alcohol consumption level
# (deep-sleep rows, missing alcohol values dropped).
p3 <- deep_df %>%
  filter(!is.na(alcohol_consumption)) %>%
  ggplot(aes(x = factor(alcohol_consumption), y = sleep_efficiency, fill = factor(alcohol_consumption))) +
  geom_violin() +
  theme_minimal() +
  # The fill repeats the x axis, so the legend carries no extra information.
  # Must come after theme_minimal(), which would otherwise reset it.
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Dark2")
ggplotly(p3)

# Same comparison as a boxplot.
deep_df %>%
  filter(!is.na(alcohol_consumption)) %>%
  ggplot(aes(x = factor(alcohol_consumption), y = sleep_efficiency, fill = factor(alcohol_consumption))) +
  geom_boxplot()

#smoking

# Distribution of sleep efficiency for smokers vs. non-smokers (deep-sleep rows).
# No NA filter is needed: the data summary reports no missing smoking_status.
p4 <- deep_df %>%
  ggplot(aes(x = factor(smoking_status), y = sleep_efficiency, fill = factor(smoking_status))) +
  geom_violin() +
  theme_minimal() +
  # The fill repeats the x axis, so the legend carries no extra information.
  # Must come after theme_minimal(), which would otherwise reset it.
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Dark2")
ggplotly(p4)

#exercise

# Distribution of sleep efficiency for each exercise frequency (deep-sleep rows,
# missing exercise values dropped).
p5 <- deep_df %>%
  filter(!is.na(exercise_frequency)) %>%
  ggplot(aes(x = factor(exercise_frequency), y = sleep_efficiency, fill = factor(exercise_frequency))) +
  geom_violin() +
  theme_minimal() +
  # The fill repeats the x axis, so the legend carries no extra information.
  # Must come after theme_minimal(), which would otherwise reset it.
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Paired")
ggplotly(p5)

# Same comparison as a boxplot. Uses deep_df rather than df so that each row is
# counted once, consistent with the other plots in this section.
deep_df %>%
  filter(!is.na(exercise_frequency)) %>%
  ggplot(aes(x = factor(exercise_frequency), y = sleep_efficiency, fill = factor(exercise_frequency))) +
  geom_boxplot()

#deep sleep

# Deep-sleep percentage vs. sleep efficiency with a smoothed trend (no
# confidence band); the smoothing method is left at ggplot2's default.
ggplot(deep_df, aes(x = sleep_type_percentage, y = sleep_efficiency)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Deep-sleep percentage vs. sleep efficiency, one panel and one linear trend per
# number of awakenings (rows with missing awakenings are excluded).
ggplot(
  filter(deep_df, !is.na(awakenings)),
  aes(x = sleep_type_percentage, y = sleep_efficiency, colour = factor(awakenings))
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal() +
  scale_color_brewer(palette = "Set2") +
  # Each panel gets its own axis ranges; this makes the 0-awakenings panel look
  # a little odd.
  facet_wrap(~factor(awakenings), scales = "free")
## `geom_smooth()` using formula = 'y ~ x'

#caffeine

# Sleep efficiency by caffeine consumption level (deep-sleep rows, missing
# caffeine values excluded).
# TODO: probably better as a color or facet.
ggplot(
  filter(deep_df, !is.na(caffeine_consumption)),
  aes(x = factor(caffeine_consumption), y = sleep_efficiency, fill = factor(caffeine_consumption))
) +
  geom_boxplot()

#awakenings,alcohol,smoking

# Linear trend of sleep efficiency against awakenings, one line per alcohol
# consumption level, in separate panels for smokers and non-smokers.
# - Uses deep_df (one sleep-type subset) like the other plots in this section.
# - Rows missing either awakenings or alcohol_consumption are dropped up front
#   instead of being discarded by ggplot with a warning.
# - The legend is kept on purpose: it is the only key to the line colours.
deep_df %>%
  filter(!is.na(awakenings), !is.na(alcohol_consumption)) %>%
  ggplot(aes(x = awakenings, y = sleep_efficiency, color = factor(alcohol_consumption))) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~smoking_status, scales = "free") +
  scale_color_brewer(palette = "Dark2") +
  theme_minimal()

#More modeling

# Candidate models for sleep efficiency, all fitted on the SAME rows.
# lm() silently drops rows with a missing value in any variable it uses, so
# fitting each model straight on deep_df gives every model a different sample
# and makes AIC/BIC comparisons between them invalid. Restricting to rows that
# are complete for every variable used by any of the four models fixes that.
model_vars <- c("sleep_efficiency", "bedtime", "age", "awakenings",
                "alcohol_consumption", "smoking_status", "sleep_type_percentage")
model_df <- deep_df[complete.cases(deep_df[, model_vars]), ]

# mod1: bedtime only (note: this reuses the name of the earlier full model).
mod1 <- lm(data = model_df, formula = sleep_efficiency ~ bedtime)
# mod2: additive effects of age, awakenings and deep-sleep percentage.
mod2 <- lm(data = model_df, formula = sleep_efficiency ~
             age + awakenings + sleep_type_percentage)
# mod3: same predictors as mod2 with all two- and three-way interactions.
mod3 <- lm(data = model_df, formula = sleep_efficiency ~
             age * awakenings * sleep_type_percentage)
# mod4: mod2 plus alcohol consumption and smoking status.
mod4 <- lm(data = model_df, formula = sleep_efficiency ~
             age + awakenings + alcohol_consumption + smoking_status +
             sleep_type_percentage)

#Find the mean squared error (the mean of the squared residuals) for each model Mod1

# Mean squared error of mod1: average of its squared residuals.
mean(mod1[["residuals"]]^2)
## [1] 0.01786273

Mod2

# Mean squared error of mod2: average of its squared residuals.
mean(mod2[["residuals"]]^2)
## [1] 0.004803478

Mod3

# Mean squared error of mod3: average of its squared residuals.
mean(mod3[["residuals"]]^2)
## [1] 0.004494944

Mod4

# Mean squared error of mod4: average of its squared residuals.
mean(mod4[["residuals"]]^2)
## [1] 0.004279514

Which is the best?

# Print fit indices (AIC, AICc, BIC, R2, adjusted R2, RMSE, Sigma) for the four
# candidate models side by side; the Name column shows the object names as
# passed here. A "not all models were fit from same data" note in the output
# means the indices are not strictly comparable.
compare_performance(mod1, mod2, mod3, mod4)
## When comparing models, please note that probably not all models were fit
##   from same data.
## # Comparison of Model Performance Indices
## 
## Name | Model |   AIC (weights) |  AICc (weights) |   BIC (weights) |    R2 | R2 (adj.) |  RMSE | Sigma
## ------------------------------------------------------------------------------------------------------
## mod1 |    lm |  -530.6 (<.001) |  -530.5 (<.001) |  -518.3 (<.001) | 0.021 |     0.019 | 0.134 | 0.134
## mod2 |    lm | -1070.2 (<.001) | -1070.1 (<.001) | -1049.9 (0.083) | 0.739 |     0.737 | 0.069 | 0.070
## mod3 |    lm | -1090.9 (0.997) | -1090.5 (0.996) | -1054.3 (0.752) | 0.756 |     0.752 | 0.067 | 0.068
## mod4 |    lm | -1079.5 (0.003) | -1079.2 (0.004) | -1051.3 (0.165) | 0.768 |     0.765 | 0.065 | 0.066
# Visual comparison of the same fit indices across the four models.
plot(compare_performance(mod1, mod2, mod3, mod4))
## When comparing models, please note that probably not all models were fit
##   from same data.

#Mod3 appears to be the best by AIC, AICc and BIC (highest weights), although mod4 has a slightly higher R2 and lower RMSE

#Add predictions

# Attach mod3's fitted values to the deep-sleep rows as a new "pred" column.
df2 <- deep_df %>%
  add_predictions(model = mod3, var = "pred")

#Make some hypothetical values for the independent variables in the model

# Four hypothetical people to predict for, one value per person in each vector.
# Several values (ages 5, 8, 70, 80; 5 and 6 awakenings) lie outside the ranges
# reported in the data summary, so those predictions are extrapolations.
newdf <- as.data.frame(list(
  age = c(70, 8, 5, 80),
  awakenings = c(5, 1, 6, 3),
  sleep_type_percentage = c(49, 18, 45, 77)
))

#Make predictions

# Predicted sleep efficiency from mod3 for each hypothetical row in newdf.
pred <- predict(object = mod3, newdata = newdf)

#Combine hypothetical input data with hypothetical predictions into one new data frame

# Hypothetical inputs alongside their model predictions, one row per person.
hyp_preds <- with(
  newdf,
  data.frame(
    age = age,
    awakenings = awakenings,
    sleep_type_percentage = sleep_type_percentage,
    pred = pred
  )
)

#Add new column showing whether a data point is real or hypothetical

# Label every row by origin so the two sources can be told apart after combining.
df2[["prediction_type"]] <- "Real"
hyp_preds[["prediction_type"]] <- "Hypothetical"

#Join real data and hypothetical data (with model predictions)

# Stack the real rows and the hypothetical rows into one data frame.
# The two inputs never share a row (prediction_type always differs), so this is
# a row bind rather than a join; columns missing from hyp_preds are filled with NA.
fullpreds <- bind_rows(df2, hyp_preds)

#Plot the predictions alongside the real data

# Model predictions (coloured by real vs. hypothetical input) over the observed
# sleep efficiency values (black), against deep-sleep percentage.
ggplot(fullpreds, aes(x = sleep_type_percentage, y = pred, color = prediction_type)) +
  # Observed values. geom_point() has no `width` parameter, so it was dropped.
  # Hypothetical rows have no observed sleep_efficiency; na.rm = TRUE skips
  # them without a warning.
  geom_point(aes(y = sleep_efficiency), color = "Black", na.rm = TRUE) +
  # Predicted values, with one linear trend line per prediction type.
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

References

National Sleep Foundation. (2024). Why do we need sleep? Sleep Foundation. Retrieved from https://www.sleepfoundation.org/how-sleep-works/why-do-we-need-sleep